###########################################################################################
################ Load libraries

library(tm)
library(gdata)
library(mosaic)
library(gsubfn)
library(quanteda)
library(quanteda.textmodels)
library(xtable)
library(ranger)
library(openxlsx)
library(pROC)
library(glmnet)
library(tidyr)
library(caret)
options(stringsAsFactors = FALSE)


###########################################################################################
################ Significance Cross-validation: 
################ Selecting model to use

## ROC and AUC here: 1 curve for each of several significance thresholds

load("train_test_data.RData")

# Set the k-fold vector (and seed)
set.seed(10012)
idx = sample(x=1:10, size=nrow(chiou2), replace=T)
cross_val_rf = function(x, thresh){
  ytest = as.numeric(chiou2$Significance>thresh)
  train = chiou_subset[idx!=x,]
  train = quanteda::convert(train, to="data.frame")
  test = chiou_subset[idx==x,]
  test = quanteda::convert(test, to="data.frame")
  train$outcome = factor(ytest[idx!=x])
  m = ranger::ranger(dependent.variable.name="outcome", data=train,
                     probability=FALSE, write.forest=TRUE, num.trees=2000)
  preds = predict(m, test, predict.all=TRUE)
  p2 = rowMeans(preds$predictions)
  out = cor(p2, ytest[idx == x]) 
  print(out)
  out2 = list(predvec = p2, truth = ytest[idx==x])
  return(out2)
}

get_roc = function(thresh=.5){
  cv_ests = sapply(1:10, FUN=function(x) cross_val_rf(x, thresh)) 
  preds = unlist(cv_ests[c(1,3,5,7,9,11,13,15,17,19)])
  truth =  unlist(cv_ests[c(2,4,6,8,10,12,14,16,18,20)])
  out = roc(response=truth, predictor = preds)
  return(out)
}
cv_roc = get_roc()
cv_roc # 0.9056


get_ridge_auc = function(thresh = 0.5){
  ytest = as.numeric(chiou2$Significance>thresh)
  train = quanteda::convert(chiou_subset, to="matrix")
  cv.ridge <- cv.glmnet(train, factor(ytest), family='binomial',
                        alpha=0, parallel=FALSE, standardize=TRUE, type.measure='auc')
  return(cv.ridge)
}

ridge = get_ridge_auc()
plot(ridge)
max(ridge$cvm) # 0.919
lambda_opt = ridge$lambda.min 
lambda_opt # 1.53




###########################################################################################
################ Generate a significance estimating model

load("train_test_data.RData")
set.seed(10012)

temp = quanteda::convert(chiou_subset, to="data.frame")
temp$outcome=factor(as.numeric(chiou2$Significance>.5))

dim(tdm1) # 101186,30229
tdm1.1 = quanteda::convert(tdm1[1:50000,], to="matrix")
tdm1.2 =  quanteda::convert(tdm1[50001:nrow(tdm1),], to="matrix")

tdm1 = rbind(tdm1.1,tdm1.2)

rm(tdm1.1)
rm(tdm1.2)
gc()

m = ranger(dependent.variable.name="outcome", data=temp[,-1],
           probability=TRUE, write.forest=TRUE, num.trees=50000)
test_rf = predict(m, tdm1)

m2 = glmnet( quanteda::convert(chiou_subset, to="matrix"),
             factor(as.numeric(chiou2$Significance>0.5)),
             family="binomial", alpha=0, standardize=TRUE, lambda=1.53)
preds2.1 = predict(m2, tdm1[1:50000,], type="response")
preds2.2 = predict(m2, tdm1[50001:nrow(tdm1),], type="response")

preds2 = rbind(preds2.1, preds2.2)
rm(preds2.1)
rm(preds2.2)
gc()


track_df2 = merge(track_df2, chiou2, all.x=T)
track_df2$traintest = ifelse(is.na(track_df2$Significance), "test", "train") 

track_df2$Significance_rf = test_rf$predictions[,2]
track_df2$Significance_log = preds2[,1]

# conduct simple analyses to inspect the estimates
cor(track_df2$Significance_log > 0.5, track_df2$Significance_rf >.787) # 0.267
summ =track_df2 %>% filter(doctype == "EO" & traintest == "test") %>% group_by(year) %>% summarize(mean(Significance>.78))
test = track_df2[track_df2$traintest == "train",]
cor(test$Significance, test$Significance_rf) # 0.675
cor(test$Significance, test$Significance_log) # 0.549
mean((test$Significance > 0.5) == (test$Significance_log > 0.5)) # 0.896
mean((test$Significance > 0.5) == (test$Significance_rf > 0.787)) # 0.877

confusionMatrix(factor(test$Significance > 0.5), factor(test$Significance_log > 0.2)) # to equalize false positive and false negative rates
confusionMatrix(factor(test$Significance > 0.5), factor(test$Significance_rf > 0.355))
## RF performs much better

## Returning to the output file
track_df3 = track_df2
track_df3$Significance_rf[!is.na(track_df3$Significance)] = track_df3$Significance[!is.na(track_df3$Significance)]
track_df3$Significance_log[!is.na(track_df3$Significance)] = track_df3$Significance[!is.na(track_df3$Significance)]


# Put the proclamations back in
load("procs.RData")
pr_df$accession = toupper(pr_df$accession)
toadd = pr_df[!pr_df$accession %in% track_df2$accession,]
toadd$traintest = "train"
toadd$Significance_log = toadd$Significance
toadd$Significance_rf = toadd$Significance

track_df3 = plyr::rbind.fill(track_df2, toadd)

## in-text figures
table(track_df3$traintest) # 88412, 11439
nrow(track_df3[track_df3$year > 1932 & track_df3$traintest=="test",]) # 46866
nrow(track_df3[track_df3$year > 1932 & track_df3$traintest=="train",]) ## 11384

roc(response=track_df3$Significance[track_df3$traintest=="train"] > 0.5,
    predictor = track_df3$Significance_rf[track_df3$traintest=="train"])
# 0.9440


## face validity
accessions = c("2016-PR-9835", "2016-53-105",
               "2016-58-2153", "2016-EO-13724",
               "2016-57-042", "2016-57-002",
               "2016-03-7857", "2016-eo-13756",
               "2016-56-001", "2016-EO-13726")
track_df3[track_df3$accession %in% accessions,]



###########################################################################################
################ Topic Cross-validation

## Merge the training labels
load("exec_action_1900_2019.RData")

# Merge by accession
out = merge(track_df3,dat,by="accession", all.x=T)
# remove merge errors
out2 = out[!duplicated(out$accession),]

rm(out)

## In-text number: how many training labels?
sum(!is.na(out2$policyarea)) # 21429
table(out2$doctype[!is.na(out2$policyarea)])


# Clean up
ids = intersect(rownames(tdm1), out2$docname)
out3 = out2[out2$docname %in% ids,]
tdm2 = tdm1[rownames(tdm1) %in% ids,]
out3$id = 1:nrow(out3)

gdata::keep(out3, tdm2, sure=T)


## Train a multi-class classifier
### First, separate the policyarea variable and melt then rebalance

# remove policy area 11 (not used in contemporary policy agendas)
out3$policyarea[grepl(x=out3$policyarea, pattern="11")] = NA

out_temp = out3[,c("accession", "policyarea", "id")]
out_temp = out_temp %>% separate(policyarea, into=paste("policy", 1:10, sep=""), sep=",| ") %>%
  reshape2::melt(c("accession", "id")) %>%
  mutate(val2 = as.numeric(value))  %>% 
  filter(!is.na(val2)) %>% filter(val2 <= 21)


# Get train labels in wide format
out_temp2 = out_temp
out_temp2$value = as.numeric(as.character(out_temp2$value))
out_temp2 = reshape2::dcast(out_temp2, accession + id ~ value)
out_temp3 = out_temp2[,-c(1,2)]
out_temp3[!is.na(out_temp3)] = 1
out_temp3[is.na(out_temp3)] = 0
out_temp4 = as.data.frame(t(scale(t(out_temp3), center=FALSE, scale=rowSums(out_temp3))))
out_temp4$accession = out_temp2$accession
out_temp4$id = out_temp2$id


# Rebalance

idx = setdiff(1:nrow(out3),out_temp4$id)

train.x = tdm2[out_temp$id,]
test.x = tdm2[idx,]
train.y = out_temp$val2

train.x2 = as.data.frame(train.x)
train.y2 = as.factor(train.y)
temp = cbind(train.y2, train.x2)
train.x2 = UBL::AdasynClassif(form = train.y2 ~ ., dat=temp, k=4)

summary(train.x2$train.y2)

gdata::keep(train.x2, train.y2, test.x, out3, out_temp4, tdm2, sure=T)

save.image(file="tmp_policy_classifier.RData")

set.seed(10012)
idx = sample(x=1:10, size=nrow(train.x2), replace=T)
cross_val_rf = function(x){
  train = train.x2[idx!=x,]
  test = train.x2[idx==x,]
  m = ranger::ranger(dependent.variable.name="train.y2", data=train,
                     max.depth = 4,
                     probability=TRUE, write.forest=TRUE, num.trees=5000)
  preds = predict(m, test, n.trees=5000, type="response")
  preds = preds$predictions
  out2 = list(predvec = preds, truth = test$train.y2)
  return(out2)
}

get_roc = function(){
  cv_ests = lapply(sort(unique(idx)), FUN=function(x) cross_val_rf(x)) 
  preds = sapply(cv_ests, FUN=function(x) x$predvec)
  preds = do.call(rbind, preds)
  truth = sapply(cv_ests, FUN=function(x) x$truth)
  truth = unlist(truth)
  out = list(preds, truth)
  return(out)
}
#train.x = convert(train.x, to="data.frame")
cv_out = get_roc()
cv_roc = pROC::multiclass.roc(response = cv_out[[2]],
                              predictor = cv_out[[1]])
cv_roc # 0.8535


###########################################################################################
################ Topic Classifier
set.seed(10012)

m = ranger::ranger(dependent.variable.name="train.y2", data=train.x2,
                   probability=TRUE, write.forest=TRUE, num.trees=5000,
                   importance = "impurity")
test_rf = predict(m, tdm2) # predict on everything


# Add the predictions back into the master data frame
preds = test_rf$predictions 

preds_rank = t(apply(FUN = rank, X = preds, MARGIN=1,
                     ties.method = "random"))

out3$policyarea_1 = sapply(1:nrow(preds_rank),
                           FUN = function(x){
                             colnames(preds_rank)[which(preds_rank[x,]==20)]
                           })
out3$policyarea_2 = sapply(1:nrow(preds_rank),
                           FUN = function(x){
                             colnames(preds_rank)[which(preds_rank[x,]==19)]
                           })

out3$policyarea_set = sapply(1:nrow(preds),
                             FUN = function(x){
                               paste0(colnames(preds)[which(preds[x,]>=0.14)],collapse=",")
                             })

table(out3$policyarea_1)
table(out3$policyarea_2)


###########################################################################################
################ Clean up the final product and save

out4 = out3 %>% mutate(year = year.x) %>%
  select(accession, year, doctype, file, docnum,
         docname, id, date, Significance, traintest,
         Significance_rf, policyarea_1, policyarea_2, policyarea_set)

track_df3 = out4

save(track_df3, file="unilateral_preds.RData")
write.csv(track_df3, file="unilateral_preds.csv")


## Explore
track_df3 %>% filter(policyarea_set == "16,18,19,20")

summary(sapply(track_df3$policyarea_set, nchar)) # mean 3.521

mean(track_df3$policyarea_set=="") # 0.057




##### Heteroskedasticity
##### (see Appendix B.6)

load("train_test_data.RData")

# Set the k-fold vector (and seed)
set.seed(10012)
idx = sample(x=1:10, size=nrow(chiou2), replace=T)
cross_val_rf = function(x, thresh = 0.5){
  ytest = as.numeric(chiou2$Significance>thresh)
  train = chiou_subset[idx!=x,]
  train = quanteda::convert(train, to="data.frame")
  test = chiou_subset[idx==x,]
  test = quanteda::convert(test, to="data.frame")
  train$outcome = factor(ytest[idx!=x])
  m = ranger::ranger(dependent.variable.name="outcome", data=train,
                     probability=FALSE, write.forest=TRUE, num.trees=2000)
  preds = predict(m, test, predict.all=TRUE)
  p2 = rowMeans(preds$predictions)
  out2 = list(predvec = p2, truth = chiou2$Significance[idx==x])
  return(out2)
}

cv_ests = sapply(1:10, FUN=function(x) cross_val_rf(x, thresh = 0.5)) 
preds = unlist(cv_ests[c(1,3,5,7,9,11,13,15,17,19)])
truth =  unlist(cv_ests[c(2,4,6,8,10,12,14,16,18,20)])

## heteroskedasticity
cor(preds, abs(truth-preds)) # -0.02
plot(preds, abs(truth-preds))


##### Temporal Validity
##### (see Appendix B.7)

set.seed(10012)

idx = cut(1:nrow(chiou2), 10)
cross_val_rf = function(x){
  train = train.x2[idx!=x,]
  test = train.x2[idx==x,]
  m = ranger::ranger(dependent.variable.name="train.y2", data=train,
                     max.depth = 4,
                     probability=TRUE, write.forest=TRUE, num.trees=5000)
  preds = predict(m, test, n.trees=5000, type="response")
  preds = preds$predictions
  out2 = list(predvec = preds, truth = test$train.y2)
  return(out2)
}
temporal_cv_out = get_roc()
temporal_cv_roc = pROC::multiclass.roc(response = temporal_cv_out[[2]],
                                       predictor = temporal_cv_out[[1]])
temporal_cv_roc # 0.499



## Threshold analysis
## (see Appendix B.9)

load("train_test_data.RData")

threshes = seq(0, 1.5, by=.1)

thresh_test = function(thresh){
  ytest = as.numeric(chiou2$Significance>thresh)
  train = chiou_subset
  train = quanteda::convert(train, to="data.frame")
  train$outcome = factor(ytest)
  m = ranger::ranger(dependent.variable.name="outcome", data=train[,-1],
                     probability=FALSE, write.forest=TRUE, num.trees=2000)
  preds = predict(m, tdm1, predict.all=TRUE)
  return(preds) 
}

options(expressions = 5e5)
thresh_tests = lapply(threshes, thresh_test)

out = do.call(cbind, thresh_tests)
colnames(out) = paste("Significance", threshes, sep="_")
track_df2 = merge(track_df2, chiou2, all=T)
track_df2 = track_df2[!duplicated(track_df2$id),]

for(i in 1:length(colnames(out))){
  track_df2[colnames(out)[i]] <- track_df2$Significance
  track_df2[colnames(out)[i]] = out[,i]
}
save(track_df2, file="threshold_test_significance_probs.RData")
